Data Wrangling COVID-19 Mini-Challenge

Imports/Setup

1 import pandas as pd
2 import wikidata_plain_sparql as wikidata
3
4 from bokeh.palettes import turbo
5 from bokeh.plotting import figure, output_notebook, show, gridplot
6
7 # set bokeh output mode to notebook
8 output_notebook()
9
10 from helper import get_jhu_cached, create_grid

Loading BokehJS ...

Worldwide COVID-19 Data

1 # get all dates from earliest available data until yesterday
2 all_dates = pd.date_range(start='2020-01-22', end='today')[:-1]
3
4 # standardize column names for all entries
5 def rename_columns(column):
6 column_map = {
7 'Lat': 'Latitude',
8 'Long_': 'Longitude',
9 'Incidence_Rate': 'Incident_Rate'
10 }
11 if column in column_map:
12 return column_map[column]
13 return column.replace('/', '_').replace('-', '_').replace(' ', '_')
14
15 # load data from all dates
16 all_data = []
17 for date in all_dates:
18 data = pd.read_csv(get_jhu_cached(date))
19 data.rename(columns=rename_columns, inplace=True)
20 data['Date'] = date
21 all_data.append(data)
22
23 # combine data from all days
24 raw_data = pd.concat(all_data)
1 # standardize country names for all entries
2 country_mapping = {
3 'MS Zaandam|Diamond Princess|Cruise Ship': 'Others', # move cruise ships to others
4 'Hong Kong.+': 'Hong Kong',
5 'Iran.+': 'Iran',
6 '.*Congo.*': 'Congo',
7 'Mainland China': 'China',
8 '.*Bahamas.*': 'The Bahamas',
9 '.*Gambia.*': 'The Gambia',
10 'Viet Nam': 'Vietnam',
11 'Taiwan\*': 'Taiwan',
12 'Cote d\'Ivoire': 'Ivory Coast',
13 'Cabo Verde': 'Cape Verde',
14 'Russian Federation': 'Russia',
15 ' Azerbaijan': 'Azerbaijan',
16 'Holy See': 'Vatican City',
17 'Republic of Ireland': 'Ireland',
18 'Republic of Moldova': 'Moldova',
19 'Czechia': 'Czech Republic',
20 'Republic of Korea|Korea, South': 'South Korea',
21 'Timor-Leste': 'East Timor',
22 'Macao SAR|Macau': 'Macao',
23 'UK': 'United Kingdom',
24 'Jersey|Guernsey': 'Channel Islands',
25 'Dominican Republicn Republic|Dominica': 'Dominican Republic'
26 }
27
28 raw_data['Country_Region'].replace(to_replace=country_mapping.keys(), value=country_mapping.values(), regex=True, inplace=True)
1 # group data by country
2 daily_updates = raw_data.groupby(['Country_Region', 'Date']).agg(
3 Confirmed = ('Confirmed','sum'),
4 Deaths = ('Deaths','sum'),
5 ).reset_index()
6
7 # get changes in data
8 updates_per_country = daily_updates.groupby('Country_Region').diff()
9 daily_updates['New_Confirmed'] = updates_per_country['Confirmed'].diff().fillna(0)
10 daily_updates['New_Deaths'] = updates_per_country['Deaths'].diff().fillna(0)
11
12 worldwide_pretty = daily_updates.loc[:, ['Date', 'Country_Region', 'New_Confirmed', 'Confirmed', 'New_Deaths', 'Deaths']]
1 # get all countries incl. colors
2 all_countries = daily_updates['Country_Region'].sort_values().unique()
3 colors = turbo(all_countries.size)
1 # show new cases
2 new_cases_graph = figure(title='New COVID-19 cases per country', y_axis_label='new cases', x_axis_type='datetime', sizing_mode='stretch_width')
3
4 i = 0
5 for country in all_countries:
6 updates_for_country = daily_updates.loc[daily_updates['Country_Region'] == country]
7 new_cases_graph.line(updates_for_country['Date'], updates_for_country['New_Confirmed'], line_color=colors[i], legend_label=country, line_width=2)
8 i += 1
9 show(new_cases_graph)

1 # show total cases
2 total_cases_graph = figure(title='Total COVID-19 cases per country', y_axis_label='total cases', x_axis_type='datetime', sizing_mode='stretch_width')
3
4 i = 0
5 for country in all_countries:
6 updates_for_country = daily_updates.loc[daily_updates['Country_Region'] == country]
7 total_cases_graph.line(updates_for_country['Date'], updates_for_country['Confirmed'], line_color=colors[i], legend_label=country, line_width=2)
8 i += 1
9 show(total_cases_graph)

Swiss COVID-19 Data

1 # get population data from WikiData
2 canton_data = wikidata.query('''
3 SELECT ?shortCode ?population ?canton WHERE {
4 ?canton wdt:P31 wd:Q23058.
5 ?canton wdt:P300 ?shortCode.
6 OPTIONAL {
7 ?canton p:P1082 ?population_stmt.
8 ?population_stmt ps:P1082 ?population.
9 ?population_stmt pq:P585 ?population_date.
10 }
11 FILTER NOT EXISTS {
12 ?canton p:P1082/pq:P585 ?population_date_.
13 FILTER (?population_date_ > ?population_date)
14 }
15 }
16 ORDER BY ?shortCode
17 ''')
18 canton_data.set_index('shortCode', inplace=True)
1 raw_data = pd.read_csv('https://raw.githubusercontent.com/openZH/covid_19/master/COVID19_Fallzahlen_CH_total_v2.csv')
2
3 # convert to date
4 raw_data['date'] = pd.to_datetime(raw_data['date'])
5
6 # remove FL
7 swiss = raw_data[raw_data['abbreviation_canton_and_fl'] != 'FL']
8
9 # only use data after 1st of june
10 swiss = swiss[swiss['date'] >= '2020-05-31']
11
12 # only keep useful entries
13 conf_cases = swiss.loc[-swiss['ncumul_conf'].isna()].copy()
14
15 # get all cantons
16 all_cantons = conf_cases['abbreviation_canton_and_fl'].unique()
17 all_cantons.sort()
18
19 # calculate new cases
20 conf_cases['new_cases'] = 0
21
22 for canton in all_cantons:
23 updates_for_canton = conf_cases.loc[conf_cases['abbreviation_canton_and_fl'] == canton]
24 previous_index = None
25 total_cases = 0
26 for index in updates_for_canton.index:
27 if previous_index != None:
28 new_cases = conf_cases.at[index, 'ncumul_conf'] - conf_cases.at[previous_index, 'ncumul_conf']
29 conf_cases.at[index, 'new_cases'] = new_cases
30 conf_cases.at[index, 'new_cases_relative'] = new_cases / int(canton_data.at['CH-' + canton, 'population']) * 100000
31 total_cases += new_cases
32 conf_cases.at[index, 'total_cases'] = total_cases
33 conf_cases.at[index, 'total_cases_relative'] = total_cases / int(canton_data.at['CH-' + canton, 'population']) * 100000
34 conf_cases.at[index, 'new_deaths'] = conf_cases.at[index, 'ncumul_deceased'] - conf_cases.at[previous_index, 'ncumul_deceased']
35 previous_index = index
36
37 conf_cases = conf_cases[conf_cases['date'] >= '2020-06-01']
38
39 swiss_pretty = conf_cases.loc[:, ['date', 'abbreviation_canton_and_fl', 'new_cases', 'total_cases', 'new_deaths']]

New COVID-19 cases per 100'0000 residents

1 # calculate new cases
2 graphs = []
3 max_new_cases = conf_cases['new_cases_relative'].max()
4 for canton in all_cantons:
5 update_for_canton = conf_cases.loc[conf_cases['abbreviation_canton_and_fl'] == canton].copy()
6
7 update_for_canton['new_cases_relative_avg'] = update_for_canton['new_cases_relative'].rolling(window=7).mean()
8
9 new_cases_graph = figure(title=canton, y_axis_label='new cases', y_range=[0, max_new_cases], x_axis_type='datetime')
10 new_cases_graph.line(update_for_canton['date'], update_for_canton['new_cases_relative'], line_width=1)
11 new_cases_graph.line(update_for_canton['date'], update_for_canton['new_cases_relative_avg'], line_color='red', line_width=1)
12 graphs.append(new_cases_graph)
13
14 show(create_grid(graphs, sizing_mode='scale_width'))

Total COVID-19 cases since 1st of june per 100'000 residents

1 # calculate total cases
2 graphs = []
3 max_total_cases = conf_cases['total_cases_relative'].max()
4 for canton in all_cantons:
5 update_for_canton = conf_cases.loc[conf_cases['abbreviation_canton_and_fl'] == canton]
6
7 total_cases_graph = figure(title=canton, y_axis_label='total cases', y_range=[0, max_total_cases], x_axis_type='datetime')
8 total_cases_graph.line(update_for_canton['date'], update_for_canton['total_cases_relative'], line_width=1)
9 graphs.append(total_cases_graph)
10
11 show(create_grid(graphs, sizing_mode='scale_width'))

Final Data Frames

1 worldwide_pretty

Date Country_Region New_Confirmed Confirmed New_Deaths Deaths
0 2020-02-24 Afghanistan 0.0 1.0 0.0 0.0
1 2020-02-25 Afghanistan 0.0 1.0 0.0 0.0
2 2020-02-26 Afghanistan 0.0 1.0 0.0 0.0
3 2020-02-27 Afghanistan 0.0 1.0 0.0 0.0
4 2020-02-28 Afghanistan 0.0 1.0 0.0 0.0
... ... ... ... ... ... ...
79632 2020-03-12 occupied Palestinian territory 25.0 0.0 0.0 0.0
79633 2020-03-14 occupied Palestinian territory 0.0 0.0 0.0 0.0
79634 2020-03-15 occupied Palestinian territory 0.0 0.0 0.0 0.0
79635 2020-03-16 occupied Palestinian territory 0.0 0.0 0.0 0.0
79636 2020-03-17 occupied Palestinian territory 0.0 0.0 0.0 0.0

79637 rows × 6 columns

1 swiss_pretty

date abbreviation_canton_and_fl new_cases total_cases new_deaths
2346 2020-06-01 BL 1 1.0 0.0
2347 2020-06-01 FR 0 0.0 0.0
2348 2020-06-01 GE 1 1.0 0.0
2349 2020-06-01 GR 0 0.0 0.0
2350 2020-06-01 JU 0 0.0 0.0
... ... ... ... ... ...
10709 2021-05-02 NW 3 2340.0 0.0
10710 2021-05-02 TI 38 28937.0 0.0
10711 2021-05-02 BE 116 60502.0 1.0
10712 2021-05-02 SZ 26 10899.0 0.0
10713 2021-05-02 BS 73 11126.0 0.0

7839 rows × 5 columns

1